import pandas as pd
import seaborn as sns #visulasiation
from langdetect import detect #language detection
import matplotlib.pyplot as plt
import string #for punctuation check
import sys #check for encoding
from nltk.corpus import stopwords #for removing stopwords
from textblob import TextBlob # spelling correction
from nltk.stem import WordNetLemmatizer #lemmatizing
from nltk.stem import PorterStemmer #stemming
import numpy as np
import cv2
from PIL import Image
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the scraped last.fm lyrics dataset.
df=pd.read_csv('./songs_last.csv')
print('Dataset has {} rows and {} columns'.format(len(df), len(df.columns)))
df.columns
df = df.rename(columns=lambda x: x.replace('ID', 'Link')) # the scraped 'ID' column actually holds the song URL, so rename it to 'Link'
df.dtypes
df.Category.unique()
df.head()
Number of missing values per column:
# Count missing values per column before cleaning.
print("\nNumber of missing values:\n",df.isnull().sum())
Removing rows that have any missing values:
# Drop every row that has at least one missing value.
df.dropna(inplace=True)
print('Dataset has {} rows and {} columns'.format(len(df), len(df.columns)))
Number of unique values per attribute
# Unique value count per attribute; reveals that songs repeat across genre charts.
print ("Unique values :\n",df.nunique())
The table above shows the number of unique elements for each attribute. The interpretation of this table is that a unique song is not categorized under only one genre. Therefore, multi-label classification algorithms should be applied for the classification of lyrics by genre, and the data structure should be adapted so that multi-label algorithms can be applied.
#sample of a duplicated song: the same track appears once per genre chart
df.loc[df['Name'] == 'Mr. Brightside']
#create a unique numeric ID per song (same link -> same ID)
df['ID'] = pd.factorize(df.Link)[0]
# move the new ID column to the front
cols = list(df.columns)
cols = [cols[-1]] + cols[:-1]
df = df[cols]
#set index per song as ID
df=df.set_index(df.columns[0])
df.head()
#one-hot encode the genre column
dummy = pd.get_dummies(df['Category'])
dummy.head()
dummy = dummy.rename(columns=lambda x: x.replace('hip-hop', 'hiphop'))
dfd = pd.concat([df, dummy], axis=1)
dfd.drop(['Category', 'Link'], inplace=True, axis=1)
dfd.head()
dfd.shape
#convert per-row dummies to multilabel dummies by summing over the song ID
dum=dfd.groupby(dfd.index).sum()
dum.head()
print(len(dum))
dfd = dfd.loc[~dfd.index.duplicated(keep='first')] # removing duplicated songs
print(len(dfd))
# NOTE(review): columns 3..14 are assumed to be the single-label dummy columns
# here and throughout the notebook -- confirm if the column layout changes.
dfu=dfd.drop(list(dfd)[3:15], axis=1) # remove single labelled dummies
df = pd.concat([dfu, dum], axis=1) #concatenate multi labelled dummies with rest of dataset
df.head()
df.shape
df.nunique()
df.alternative.unique()
df.loc[df['Name'] == 'Mr. Brightside']
It is surprising to see a value of 2 in any dummy column. However, this proves that some songs are duplicated on the genre track lists of last.fm. For example, on the alternative track list https://www.last.fm/tag/alternative/tracks?page=1 we see 'Mr. Brightside' by The Killers at position 22, but exactly the same song appears again at row 353.
# A song can appear twice on the same genre chart (last.fm duplicates), which
# makes the groupby-sum above produce dummy values of 2. Cap every genre
# indicator at 1 so the columns are proper binary labels. This replaces twelve
# copy-pasted `df.loc[df[col] == 2, col] = 1` statements with one vectorized
# clip, and also handles any value above 2 should the source data change.
genre_cols = ['alternative', 'country', 'electronic', 'folk', 'hiphop', 'house',
              'indie', 'metal', 'jazz', 'pop', 'rap', 'rnb']
df[genre_cols] = df[genre_cols].clip(upper=1)
df.nunique()
df.loc[df['Name'] == 'Intro'] # dataset still contains some instrumental songs
df.shape
# Raw-lyrics word count; used below to spot instrumentals and junk entries.
df['word_count'] = df['Lyrics'].str.split().str.len()
outliers = df.loc[df['word_count'] <= 10]
outliers.head()
len(outliers)
First of all, instrumental songs should be excluded because there are no lyrics to analyse, and outliers with fewer than 10 words should be eliminated as well. So eliminate the songs with fewer than 10 words and those with more than 1300 words.
#remove outliers: keep songs with more than 10 and at most 1300 words
df = df[df['word_count'] > 10]
df = df[df['word_count'] <= 1300]
sns.violinplot(x=df["word_count"])
len(df) # number of samples left
# Guess the language of each lyric. NOTE(review): langdetect's detect() raises
# LangDetectException on empty/ambiguous text -- assumed not to occur after the
# word-count filtering above; confirm if the dataset changes.
df['Language'] = df['Lyrics'].apply(detect)
df[df['Language'] != 'en'].head(5) # langdetect is not accurate 100%
df['Language'].value_counts()
df.loc[df['Name'] == 'Radioactivity'] # this song contains other languages but is assigned to english
# Save the non-English subset for manual inspection, then keep English only.
non_english=df[df['Language'] != 'en']
non_english.to_csv('non_english.csv', index=False, encoding="utf-8")
print('There are {} songs which are not in English.'.format(len(df[df['Language'] != 'en'])))
df = df[df['Language'] == 'en']
print('Songs are not in English are eliminated!')
print('Now dataset has {} rows.'.format(len(df)))
# keep=False marks every occurrence of duplicated lyrics, not just the later copies.
mask = df.Lyrics.duplicated(keep=False)
duplicated=df[mask]
duplicated
These songs have exactly the same lyrics but are considered different songs. For example: https://www.last.fm//music/Nelly/_/Country+Grammar and https://www.last.fm//music/Nelly/_/Country+Grammar+(Hot...) This finding shows that there are factors other than lyrics that play a significant role in defining song genres. The assumption is that the genre of a song may differ by audio features such as beat, rhythm, and timbre. However, it proves that the lyrics of a song by themselves are not enough to decide on its genre, which restricts this research. In order not to reduce the accuracy of the algorithms, these songs are excluded from this study.
# Songs sharing identical lyrics but different genre labels would confuse the
# classifier, so keep only the first occurrence of each lyric.
df = df.drop_duplicates(subset='Lyrics', keep='first')
len(df)
# The 12 genre dummy columns occupy positions 3..14 of the frame.
genres = list(df.columns.values)
genres = genres[3:15]
print(genres)
# Calculating number of songs in each genre. A comprehension replaces the
# original append loop (same pairs, same order).
counts = [(genre, df[genre].sum()) for genre in genres]
df_stats = pd.DataFrame(counts, columns=['Genre', 'number of songs'])
df_stats
#Number of Songs per genre
categories = list(df.columns.values)
cat=categories[3:15]
sns.set(font_scale = 2)
plt.figure(figsize=(18,6))
a=df.iloc[:,3:15].sum().values
# Pass x/y as keywords: positional data arguments to seaborn plotting
# functions were deprecated in 0.12 and later removed.
ax= sns.barplot(x=cat, y=a)
plt.title("Songs per Genre", fontsize=25)
plt.ylabel('Number of songs', fontsize=20)
#adding the count labels above each bar
rects = ax.patches
labels = a
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.savefig('songspergenre.png')
plt.show()
# Distribution of how many genre labels each song carries.
rowSums = df.iloc[:, 3:15].sum(axis=1)
multiLabel_counts = rowSums.value_counts()
multiLabel_counts
sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
# Keyword x/y: positional data arguments were deprecated in seaborn 0.12.
ax = sns.barplot(x=multiLabel_counts.index, y=multiLabel_counts.values)
plt.title("Songs categorizing multiple genre ")
plt.ylabel('Number of songs', fontsize=18)
plt.xlabel('Number of genre', fontsize=18)
#adding the count labels above each bar
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.savefig('songsmultigenre.png')
plt.show()
# Pairwise correlation between the 12 genre labels.
plt.figure(figsize = (20,8))
sns.heatmap((df.loc[:, ['alternative','country','electronic','folk','hiphop','house','indie','jazz','metal','pop','rap','rnb']]).corr(),
annot=True,linewidths=0.0, fmt='.2f')
#songs per artist (top 10 most frequent singers)
artist = df["Singer"].value_counts()
artist.head(10)
len(df)
string.punctuation
df.loc[9,'Lyrics']
# Remove apostrophes first, without inserting a space, so contractions like
# "don't" collapse to "dont" instead of "don t".
df['Lyrics1'] = df['Lyrics'].str.replace("'", '')
df.loc[9,'Lyrics1']
df.loc[8,'Lyrics1']
df.loc[3,'Lyrics1']
# Replace every remaining non-word character with a space, then collapse runs
# of whitespace. The original chained four no-op replacements of a single
# space with a single space; one \s+ regex does the intended collapsing.
# regex=True is passed explicitly because the pandas default changed to
# literal matching, which would break the character-class pattern.
df['Lyrics1'] = df['Lyrics1'].str.replace(r'[^\w\s]', ' ', regex=True).str.replace(r'\s+', ' ', regex=True)
df.loc[9,'Lyrics1']
# Split CamelCase run-together words (a scraping artefact) into separate tokens.
df['Lyrics1'] = df['Lyrics1'].str.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+').apply(' '.join)
df.loc[8,'Lyrics1']
df.loc[3,'Lyrics1']
# Keep purely alphabetic tokens only.
df['Lyrics1'] = df['Lyrics1'].apply(lambda x: " ".join(x for x in str(x).split() if x.isalpha()))
df.loc[2202,'Lyrics1']
Removing non-alphabetic characters does not work for Japanese characters; therefore non-ASCII characters can be removed instead.
#Check the default encoding used by the Python runtime
print(sys.getdefaultencoding())
# Sample rows that still contain non-ASCII characters.
df.loc[2202,'Lyrics1']
df.loc[2765,'Lyrics1']
Since we have non-ASCII characters in our data, we remove them.
# Encode to ASCII with '?' substitution, then strip the substitution marks:
# this drops Japanese/accented characters that survived the alpha filter.
# (Any literal '?' would also be removed, but punctuation was stripped above.)
df['Lyrics1']=df['Lyrics1'].apply(lambda x: x.encode('ascii', 'replace').decode().replace("?", ""))
df['Lyrics1']=df['Lyrics1'].apply(lambda x: " ".join(x for x in str(x).split()))
df.loc[2202,'Lyrics1']
# Lower-case every token.
df['Lyrics1'] = df['Lyrics1'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
df.loc[9,'Lyrics1']
df.loc[8,'Lyrics1']
# Expand apostrophe-less contractions so that stop-word removal catches them.
# Order matters (e.g. " ill " must run before the generic " ll " rule), so the
# rules live in an ordered list instead of 18 copy-pasted statements.
# regex=False makes the literal matching explicit (the pandas default changed).
contractions = [
    (" dont ", ' do not '),
    (" im ", ' i am '),
    (" youre ", ' you are '),
    (" cant ", ' can not '),
    (" wont ", ' will not '),
    (" wasnt ", ' was not '),
    (" gonna ", ' going to '),
    (" theres ", ' there is '),
    (" youd ", ' you would '),
    (" heres ", ' here is '),
    (" ill ", ' i will '),
    (" ll ", ' will '),
    (" gotta ", ' have got to '),
    (" youve ", ' you have '),
    (" aint ", ' not '),
    (" ive ", ' i have '),
    (" wanna ", ' want to '),
    (" theyre ", ' they are '),
]
for short, expanded in contractions:
    df['Lyrics1'] = df['Lyrics1'].str.replace(short, expanded, regex=False)
Manual normalization has been implemented, because when stop words such as 'I', 'am', 'do', 'not' are removed, their contracted forms like 'dont' and 'im' would remain: they are not spelled in the standard way, so they are not in the stop-word list even though they have exactly the same meaning.
df.loc[8,'Lyrics1']
df.loc[9,'Lyrics1']
# Remove English stop words. The NLTK word list is wrapped in a set: the
# original kept it as a list, making every per-token membership test O(n)
# across the whole corpus; a set makes it O(1) with identical results.
stop = set(stopwords.words('english'))
df['Lyrics1'] = df['Lyrics1'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
df.loc[9,'Lyrics1']
df.loc[8,'Lyrics1']
df.loc[1879,'Lyrics1']
df.loc[8,'Lyrics1']
# Quick experiment with TextBlob spelling correction, first on a toy sentence,
# then on a real lyric (rejected -- see the note below in the notebook).
Test1 = TextBlob("I tnd to spel wrds incorrectly")
Test1.correct()
df.loc[8,'Lyrics1']
a=df.loc[8,'Lyrics1']
Test1 = TextBlob(a)
Test1.correct()
Spelling correction did not work well. For example, 'wanna' was updated to 'anna' by TextBlob, so we do not use it.
# Lemmatize each token. NOTE(review): WordNetLemmatizer defaults to noun POS,
# so verb forms are mostly left unchanged -- confirm this is acceptable.
lemmatizer = WordNetLemmatizer()
df['Lyrics1'] = df['Lyrics1'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in x.split()]))
df.loc[8,'Lyrics1']
# Stemming experiment; result kept in a separate Lyrics2 column for comparison.
ps = PorterStemmer()
df['Lyrics2'] = df['Lyrics1'].apply(lambda x: " ".join([ps.stem(w) for w in x.split()]))
df.loc[8,'Lyrics2']
Stemming modifies 'crazy' to 'crazi', 'baby' to 'babi' and 'romance' to 'romanc', so I have decided not to use stemming.
# Discard the stemmed column; the lemmatized Lyrics1 column is kept.
df=df.drop(['Lyrics2'], axis=1)
df.head()
#Word count in each song (after cleaning):
df['count_word']=df["Lyrics1"].apply(lambda x: len(str(x).split()))
#Unique word count per song
df['count_unique_word']=df["Lyrics1"].apply(lambda x: len(set(str(x).split())))
df.head(2)
# Word-count distribution per genre.
#Create subsets
jazz=df[df['jazz'] ==1 ].count_word
pop=df[df['pop'] ==1 ].count_word
rnb=df[df['rnb'] ==1 ].count_word
rap=df[df['rap'] ==1 ].count_word
hiphop=df[df['hiphop'] ==1 ].count_word
folk=df[df['folk'] ==1 ].count_word
alternative=df[df['alternative'] ==1 ].count_word
house=df[df['house'] ==1 ].count_word
electronic=df[df['electronic'] ==1 ].count_word
metal=df[df['metal'] ==1 ].count_word
indie=df[df['indie'] ==1 ].count_word
country=df[df['country'] ==1 ].count_word
## combine these different collections into a list
data_plot = [jazz,pop,rnb,rap,hiphop,folk,alternative,house,indie,electronic,metal,country]
# Create a figure instance
fig = plt.figure(1, figsize=(20, 8))
# Create an axes instance
ax = fig.add_subplot(111)
# Draw the boxplot once with fillable boxes; the original called ax.boxplot
# twice, stacking an unstyled plot underneath the styled one.
bp = ax.boxplot(data_plot, widths=0.6, patch_artist=True)
## change outline color, fill color and linewidth of the boxes
for box in bp['boxes']:
    box.set(color='#ac7339', linewidth=2)
    box.set(facecolor='#ffcc00')
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#ffcc00', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#ac7339', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='o', color='#ac7339', alpha=0.5)
## Custom x-axis labels
ax.set_xticklabels(['jazz', 'pop', 'rnb', 'rap','hiphop','folk', 'alternative','house','indie','electronic','metal','country'])
ax.set_title("Counts of Words")
# Unique-word-count (lexical diversity) distribution per genre.
#Create subsets
jazz=df[df['jazz'] ==1 ].count_unique_word
pop=df[df['pop'] ==1 ].count_unique_word
rnb=df[df['rnb'] ==1 ].count_unique_word
rap=df[df['rap'] ==1 ].count_unique_word
hiphop=df[df['hiphop'] ==1 ].count_unique_word
folk=df[df['folk'] ==1 ].count_unique_word
alternative=df[df['alternative'] ==1 ].count_unique_word
house=df[df['house'] ==1 ].count_unique_word
electronic=df[df['electronic'] ==1 ].count_unique_word
metal=df[df['metal'] ==1 ].count_unique_word
indie=df[df['indie'] ==1 ].count_unique_word
country=df[df['country'] ==1 ].count_unique_word
## combine these different collections into a list
data_to_plot = [jazz,pop,rnb,rap,hiphop,folk,alternative,house,indie,electronic,metal,country]
# Create a figure instance
fig = plt.figure(1, figsize=(20, 8))
# Create an axes instance
ax = fig.add_subplot(111)
# Draw the boxplot once with fillable boxes; the original called ax.boxplot
# twice, stacking an unstyled plot underneath the styled one.
bp = ax.boxplot(data_to_plot, widths=0.6, patch_artist=True)
## change outline color, fill color and linewidth of the boxes
for box in bp['boxes']:
    box.set(color='#ac7339', linewidth=2)
    box.set(facecolor='#ffcc00')
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#ffcc00', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#ac7339', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='o', color='#ac7339', alpha=0.5)
## Custom x-axis labels
ax.set_xticklabels(['jazz', 'pop', 'rnb', 'rap','hiphop','folk', 'alternative','house','indie','electronic','metal','country'])
ax.set_title('Lexical Diversity')
# Save the figure
fig.savefig('fig.png', bbox_inches='tight')
#derived features
#Percentage of unique words in each song:
df['word_unique_percent']=df['count_unique_word']*100/df['count_word']
sns.color_palette("bright")
plt.figure(figsize=(18,10))
# Fixed typo in the displayed title ("Lexial" -> "Lexical").
plt.title("Lexical Density")
# fill=True replaces the deprecated kdeplot shade=True argument (removed in
# newer seaborn); rendering is identical.
ax=sns.kdeplot(df['word_unique_percent'], label="Average",fill=True,color='r')
ax=sns.kdeplot(df[df['pop'] == 1].word_unique_percent, label="Pop")
ax=sns.kdeplot(df[df['folk'] == 1].word_unique_percent, label="Folk",color='b')
ax=sns.kdeplot(df[df['alternative'] == 1].word_unique_percent, label="Alternative", color="g")
ax=sns.kdeplot(df[df['metal'] == 1].word_unique_percent, label="Metal", color='black')
ax=sns.kdeplot(df[df['rnb'] == 1].word_unique_percent, label="RnB", color="#2ecc71")
ax=sns.kdeplot(df[df['hiphop'] == 1].word_unique_percent, label="Hip-Hop", color='#34495e')
ax=sns.kdeplot(df[df['country'] == 1].word_unique_percent, label="Country", color='fuchsia')
ax=sns.kdeplot(df[df['rap'] == 1].word_unique_percent, label="Rap", color='orange')
ax=sns.kdeplot(df[df['electronic'] == 1].word_unique_percent, label="Electronic")
ax=sns.kdeplot(df[df['indie'] == 1].word_unique_percent, label="Indie")
ax=sns.kdeplot(df[df['jazz'] == 1].word_unique_percent, label="Jazz")
ax=sns.kdeplot(df[df['house'] == 1].word_unique_percent, label="House", color='navy')
plt.legend()
plt.xlabel('Percent unique words', fontsize=12)
plt.show()
# Lexical-density (percent unique words) distribution per genre.
#Create subsets
jazz=df[df['jazz'] ==1 ].word_unique_percent
pop=df[df['pop'] ==1 ].word_unique_percent
rnb=df[df['rnb'] ==1 ].word_unique_percent
rap=df[df['rap'] ==1 ].word_unique_percent
hiphop=df[df['hiphop'] ==1 ].word_unique_percent
folk=df[df['folk'] ==1 ].word_unique_percent
alternative=df[df['alternative'] ==1 ].word_unique_percent
house=df[df['house'] ==1 ].word_unique_percent
electronic=df[df['electronic'] ==1 ].word_unique_percent
metal=df[df['metal'] ==1 ].word_unique_percent
indie=df[df['indie'] ==1 ].word_unique_percent
country=df[df['country'] ==1 ].word_unique_percent
## combine these different collections into a list
data_plot = [jazz,pop,rnb,rap,hiphop,folk,alternative,house,indie,electronic,metal,country]
# Create a figure instance
fig = plt.figure(1, figsize=(20, 8))
# Create an axes instance
ax = fig.add_subplot(111)
# Draw the boxplot once with fillable boxes; the original called ax.boxplot
# twice, stacking an unstyled plot underneath the styled one.
bp = ax.boxplot(data_plot, widths=0.6, patch_artist=True)
## change outline color, fill color and linewidth of the boxes
for box in bp['boxes']:
    box.set(color='#ac7339', linewidth=2)
    box.set(facecolor='#ffcc00')
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#ffcc00', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#ac7339', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='o', color='#ac7339', alpha=0.5)
## Custom x-axis labels
ax.set_xticklabels(['jazz', 'pop', 'rnb', 'rap','hiphop','folk', 'alternative','house','indie','electronic','metal','country'])
ax.set_title("Lexical Density")
corpus=df['Lyrics1']
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in *corpus* as (word, count) pairs."""
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
common_words = get_top_n_words(df['Lyrics1'], 20)
df2 = pd.DataFrame(common_words, columns = ['Common_Words' , 'count'])
sns.set(font_scale = 2)
plt.figure(figsize=(20,8))
# Keyword x/y: positional data arguments were deprecated in seaborn 0.12.
ax= sns.barplot(x=df2['Common_Words'], y=df2['count'], palette='Blues_d')
plt.title("Top 20 unigrams in lyrics", fontsize=25)
plt.ylabel('Count', fontsize=20)
plt.xlabel('')
#adding the count labels above each bar
rects = ax.patches
labels = df2['count']
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set(yticklabels=[])
plt.show()
corpus=df['Lyrics1']
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus* as (bigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
common_words = get_top_n_bigram(df['Lyrics1'], 20)
df3 = pd.DataFrame(common_words, columns = ['Common_Words' , 'count'])
sns.set(font_scale = 2)
plt.figure(figsize=(20,8))
# Keyword x/y: positional data arguments were deprecated in seaborn 0.12.
ax= sns.barplot(x=df3['Common_Words'], y=df3['count'], palette='Greens_d')
plt.title("Top 20 bigrams in lyrics", fontsize=25)
plt.ylabel('Count', fontsize=20)
plt.xlabel('')
#adding the count labels above each bar
rects = ax.patches
labels = df3['count']
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.setp(ax.get_xticklabels(), rotation=60)
ax.set(yticklabels=[])
plt.show()
corpus=df['Lyrics1']
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus* as (trigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
common_words = get_top_n_trigram(df['Lyrics1'], 20)
df4 = pd.DataFrame(common_words, columns = ['Common_Words' , 'count'])
sns.set(font_scale = 2)
plt.figure(figsize=(20,8))
# Keyword x/y: positional data arguments were deprecated in seaborn 0.12.
ax= sns.barplot(x=df4['Common_Words'], y=df4['count'], palette='Reds_d')
plt.title("Top 20 trigrams in lyrics", fontsize=25)
plt.ylabel('Count', fontsize=20)
plt.xlabel('')
#adding the count labels above each bar
rects = ax.patches
labels = df4['count']
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.setp(ax.get_xticklabels(), rotation=80)
ax.set(yticklabels=[])
plt.show()
# 70/30 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=42, test_size=0.30, shuffle=True)
print(train.shape)
print(test.shape)
train_text = train['Lyrics1']
test_text = test['Lyrics1']
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF features, L2-normalised per document.
vec = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2')
x_train=vec.fit_transform(train_text) # learn vocabulary and idf from the training set; returns a sparse term-document matrix
x_test=vec.transform(test_text) # reuse the vocabulary/idf learned on training data; never fit on the test set
y_train=train.iloc[:,3:15] # the 12 genre label columns
y_test= test.iloc[:,3:15]
print(type(x_train))
print(type(y_train))
print(type(x_test))
print(type(y_test))
x_train
x_test
y_train.shape
#import libraries
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from skmultilearn.adapt import MLkNN
from skmultilearn.adapt import BRkNNaClassifier
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelD
from sklearn.model_selection import GridSearchCV
import time
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
import scipy
import time
# Hyper-parameter grid for Binary Relevance with Logistic Regression base
# classifiers (one independent binary model per genre).
parameters = [
    {
        'classifier': [LogisticRegression(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l1', 'l2'],# l1 lasso l2 ridge
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__max_iter': [10, 50, 100, 150]
    },
]
# NOTE(review): newer scikit-learn rejects penalty='l1' with the default
# lbfgs solver -- assumed to run against an older version; confirm.
clf = GridSearchCV(BinaryRelevance(require_dense=[False,True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start= time.time()
#train
clf.fit(x_train, y_train)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameter; C: {}, Maximum Iteration: {}, Penalty: {} '.format(clf.best_params_['classifier__C'],
clf.best_params_['classifier__max_iter'],
clf.best_params_['classifier__penalty']))
# Report wall-clock training time as HH:MM:SS.
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the best estimator found by the grid search
y_BRLR= clf.best_estimator_.predict(x_test)
print('The best model from grid-search for BR Logistic Regression has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_BRLR),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_BRLR),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_BRLR, normalize=False),3))# number of instances not classified exactly correctly
# NOTE(review): the ranking-based metrics below (log-loss, coverage error,
# ranking loss, LRAP, ROC AUC) are computed on hard 0/1 predictions rather
# than probability scores, which weakens their interpretation -- confirm
# whether predict_proba output was intended.
print('Log-Loss:',round(metrics.log_loss(y_test,y_BRLR.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_BRLR.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_BRLR.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_BRLR.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_BRLR.toarray()),3))
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_BRLR, average='weighted', beta=100),3)) # beta=100 weights recall far above precision
print('Micro-Recall:',round(metrics.recall_score(y_test,y_BRLR, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_BRLR, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_BRLR, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_BRLR, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_BRLR, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_BRLR, average='weighted'),3))
print('Test accuracy per genre:\n{}'.format(np.mean(y_BRLR.toarray() == y_test)))
from sklearn.metrics import classification_report
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i], y_BRLR.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i], y_BRLR.toarray()[:,i]))
    print('')
# Hyper-parameter grid for Binary Relevance with Linear SVC base classifiers.
parameters = [
    {
        'classifier': [LinearSVC(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l2'],
        'classifier__C': [1, 10,100],
        'classifier__dual':[False,True],
        'classifier__max_iter': [10, 50, 100, 150],
    },
]
clf = GridSearchCV(BinaryRelevance(require_dense=[False,True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start= time.time()
#train
clf.fit(x_train, y_train)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameters; C: {}, Maximum Iteration: {}, Dual: {}'.format(clf.best_params_['classifier__C'],clf.best_params_['classifier__max_iter'],
clf.best_params_['classifier__dual']))
# Report wall-clock training time as HH:MM:SS.
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the best estimator found by the grid search
y_BRLSVC= clf.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for BR Linear SVC has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_BRLSVC),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_BRLSVC),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_BRLSVC, normalize=False),3))# number of instances not classified exactly correctly
# NOTE(review): the ranking-based metrics below are computed on hard 0/1
# predictions rather than decision scores -- confirm this was intended.
print('Log-Loss:',round(metrics.log_loss(y_test,y_BRLSVC.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_BRLSVC.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_BRLSVC.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_BRLSVC.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_BRLSVC.toarray()),3))
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_BRLSVC, average='weighted', beta=100),3)) # beta=100 weights recall far above precision
print('Micro-Recall:',round(metrics.recall_score(y_test,y_BRLSVC, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_BRLSVC, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_BRLSVC, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_BRLSVC, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_BRLSVC, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_BRLSVC, average='weighted'),3))
print('Test accuracy per genre: \n{}'.format(np.mean(y_BRLSVC.toarray() == y_test)))
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i], y_BRLSVC.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i],y_BRLSVC.toarray()[:,i]))
    print('')
# Hyper-parameter grid for Binary Relevance with Random Forest base classifiers.
parameters = [
    {
        'classifier': [RandomForestClassifier(random_state=42, class_weight='balanced')],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [10, 50, 100, 150],
        'classifier__max_depth': [5,3, None],
    },
]
clf = GridSearchCV(BinaryRelevance(require_dense=[False,True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start= time.time()
#train
clf.fit(x_train, y_train)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameters; Criterion: {}, N Estimators: {}, Max Dept: {}'.format(clf.best_params_['classifier__criterion'],
clf.best_params_['classifier__n_estimators'], clf.best_params_['classifier__max_depth']))
# Report wall-clock training time as HH:MM:SS.
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the best estimator found by the grid search
y_BRRF= clf.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for BR Random Forest has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_BRRF),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_BRRF),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_BRRF, normalize=False),3))# number of instances not classified exactly correctly
# NOTE(review): the ranking-based metrics below are computed on hard 0/1
# predictions rather than probability scores -- confirm this was intended.
print('Log-Loss:',round(metrics.log_loss(y_test,y_BRRF.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_BRRF.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_BRRF.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_BRRF.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_BRRF.toarray()),3))
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_BRRF, average='weighted', beta=100),3)) # beta=100 weights recall far above precision
print('Micro-Recall:',round(metrics.recall_score(y_test,y_BRRF, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_BRRF, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_BRRF, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_BRRF, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_BRRF, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_BRRF, average='weighted'),3))
print('Test accuracy is per genre:\n{}'.format(np.mean(y_BRRF.toarray() == y_test)))
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i], y_BRRF.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i], y_BRRF.toarray()[:,i]))
    print('')
# Hyper-parameter grid for Binary Relevance with Gradient Boosting base
# classifiers. NOTE: no n_jobs here, so this search runs single-threaded.
parameters = [
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [50,70,100], #number of trees; higher is better but more complex
        'classifier__max_depth': [3,5], #how deep the trees grow
        'classifier__learning_rate': [0.01, 0.1, 0.5], #high learning rate risks overfitting
    },
]
clf_BRGB = GridSearchCV(BinaryRelevance(require_dense=[False,True]), parameters, cv=5, scoring='f1_weighted')
start= time.time()
#train
clf_BRGB.fit(x_train, y_train)
end=time.time()
print("Best F1-Score: ",round(clf_BRGB.best_score_,3))
print('Tuned HyperParameters; N Estimators: {}, Tree Depth: {}, Learning Rate: {},'.format(clf_BRGB.best_params_['classifier__n_estimators'],
clf_BRGB.best_params_['classifier__max_depth'],
clf_BRGB.best_params_['classifier__learning_rate']))
# Report wall-clock training time as HH:MM:SS.
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict
y_BRGB= clf_BRGB.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for BR Gradient Boosting has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_BRGB),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_BRGB),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_BRGB, normalize=False),3))# # of instances classified not exactly correct
print('Log-Loss:',round(metrics.log_loss(y_test,y_BRGB.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_BRGB.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_BRGB.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_BRGB.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_BRGB.toarray()),3))
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_BRGB, average='weighted', beta=100),3))
print('Micro-Recall:',round(metrics.recall_score(y_test,y_BRGB, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_BRGB, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_BRGB, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_BRGB, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_BRGB, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_BRGB, average='weighted'),3))
print('Test accuracy per genre: \n{}'.format(np.mean(y_BRGB.toarray() == y_test)))
for i in range(y_test.shape[1]):
print("Confusion Matrix:{}".format(y_test.columns.values[i]))
print(metrics.confusion_matrix(y_test.values[:,i], y_BRGB.toarray()[:,i]))
print("\nClassification report of {}:".format(y_test.columns.values[i]))
print(classification_report(y_test.values[:,i], y_BRGB.toarray()[:,i]))
print('')
# Binary Relevance + Gaussian Naive Bayes: no hyper-parameters to tune, fit directly.
clf_NB = BinaryRelevance(GaussianNB())
start = time.time()
clf_NB.fit(x_train, y_train)  # train
end = time.time()
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_BRNB = clf_NB.predict(x_test)  # predict
dense_brnb = y_BRNB.toarray()
# Evaluation measures.
print('BR Gaussian Naive Bayes has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_BRNB)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_BRNB)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_BRNB, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_brnb)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_brnb)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_brnb)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_brnb)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_brnb)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_BRNB, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_BRNB, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_BRNB, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_BRNB, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_BRNB, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_BRNB, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_BRNB, average='weighted')),
]:
    print(label, round(score, 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(dense_brnb == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_brnb[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_brnb[:, col]))
    print('')
# Classifier Chain + Logistic Regression: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [LogisticRegression(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l1', 'l2'],          # l1 = lasso, l2 = ridge
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__max_iter': [10, 50, 100, 150]
    },
]
clf = GridSearchCV(ClassifierChain(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
clf.fit(x_train, y_train)  # train
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
print('Tuned HyperParameter; C: {}, Maximum Iteration: {}, Penalty: {} '.format(
    clf.best_params_['classifier__C'],
    clf.best_params_['classifier__max_iter'],
    clf.best_params_['classifier__penalty']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_CCLR = clf.best_estimator_.predict(x_test)  # predict
dense_cclr = y_CCLR.toarray()
# Evaluation measures for the best model found by the search.
print('The best model from grid-search for CC Logistic Regression has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_CCLR)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_CCLR)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_CCLR, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_cclr)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_cclr)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_cclr)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_cclr)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_cclr)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_CCLR, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_CCLR, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_CCLR, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_CCLR, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_CCLR, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_CCLR, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_CCLR, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_CCLR, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_CCLR, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_CCLR, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is:\n{}'.format(np.mean(dense_cclr == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_cclr[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_cclr[:, col]))
    print('')
# Classifier Chain + Linear SVC: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [LinearSVC(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l2'],
        'classifier__C': [1, 10, 100],
        'classifier__dual': [False, True],
        'classifier__max_iter': [10, 50, 100, 150],
    },
]
clf = GridSearchCV(ClassifierChain(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
clf.fit(x_train, y_train)  # train
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
print('Tuned HyperParameters; C: {}, Maximum Iteration: {}, Dual: {}'.format(
    clf.best_params_['classifier__C'],
    clf.best_params_['classifier__max_iter'],
    clf.best_params_['classifier__dual']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_CCLSVC = clf.best_estimator_.predict(x_test)  # predict
dense_cclsvc = y_CCLSVC.toarray()
# Evaluation measures for the best model found by the search.
print('The best model from grid-search for CC Linear SVC has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_CCLSVC)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_CCLSVC)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_CCLSVC, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_cclsvc)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_cclsvc)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_cclsvc)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_cclsvc)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_cclsvc)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_CCLSVC, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_CCLSVC, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_CCLSVC, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_CCLSVC, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_CCLSVC, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_CCLSVC, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_CCLSVC, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_CCLSVC, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_CCLSVC, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_CCLSVC, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is: {}'.format(np.mean(dense_cclsvc == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_cclsvc[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_cclsvc[:, col]))
    print('')
# Classifier Chain + Random Forest: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [RandomForestClassifier(random_state=42, class_weight='balanced')],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [10, 100, 150, 500],
        'classifier__max_depth': [5, 3, None],
    },
]
clf = GridSearchCV(ClassifierChain(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
clf.fit(x_train, y_train)  # train
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
print('Tuned HyperParameters; Criterion: {}, N Estimators: {}, Max Depth: {}'.format(
    clf.best_params_['classifier__criterion'],
    clf.best_params_['classifier__n_estimators'],
    clf.best_params_['classifier__max_depth']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_CCRF = clf.best_estimator_.predict(x_test)  # predict
dense_ccrf = y_CCRF.toarray()
# Evaluation measures for the best model found by the search.
print('The best model from grid-search for CC Random Forest has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_CCRF)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_CCRF)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_CCRF, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_ccrf)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_ccrf)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_ccrf)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_ccrf)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_ccrf)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_CCRF, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_CCRF, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_CCRF, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_CCRF, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_CCRF, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_CCRF, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_CCRF, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_CCRF, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_CCRF, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_CCRF, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is:\n{}'.format(np.mean(dense_ccrf == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_ccrf[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_ccrf[:, col]))
    print('')
# Classifier Chain + Gradient Boosting: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [50, 70, 100],      # more trees: better fit, more complexity
        'classifier__max_depth': [3, 5],                # tree depth
        'classifier__learning_rate': [0.01, 0.1, 0.5],  # high learning rates tend to overfit
    },
]
clf_CCGB = GridSearchCV(ClassifierChain(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted')
start = time.time()
clf_CCGB.fit(x_train, y_train)  # train
end = time.time()
print("Best F1-Score: ", round(clf_CCGB.best_score_, 3))
print('Tuned HyperParameters; N Estimators: {}, Tree Depth: {}, Learning Rate: {},'.format(
    clf_CCGB.best_params_['classifier__n_estimators'],
    clf_CCGB.best_params_['classifier__max_depth'],
    clf_CCGB.best_params_['classifier__learning_rate']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_CCGB = clf_CCGB.best_estimator_.predict(x_test)  # predict
dense_ccgb = y_CCGB.toarray()
# Evaluation measures for the best model found by the search.
print('The best model from grid-search for CC Gradient Boosting has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_CCGB)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_CCGB)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_CCGB, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_ccgb)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_ccgb)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_ccgb)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_ccgb)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_ccgb)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_CCGB, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_CCGB, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_CCGB, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_CCGB, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_CCGB, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_CCGB, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_CCGB, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_CCGB, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_CCGB, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_CCGB, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(dense_ccgb == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_ccgb[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_ccgb[:, col]))
    print('')
# Classifier Chain + Gaussian Naive Bayes: no hyper-parameters to tune, fit directly.
clf_CCNB = ClassifierChain(GaussianNB())
start = time.time()
clf_CCNB.fit(x_train, y_train)  # train
end = time.time()
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_CCNB = clf_CCNB.predict(x_test)  # predict
dense_ccnb = y_CCNB.toarray()
# Evaluation measures.
print('CC Gaussian Naive Bayes has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_CCNB)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_CCNB)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_CCNB, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_ccnb)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_ccnb)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_ccnb)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_ccnb)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_ccnb)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_CCNB, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_CCNB, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_CCNB, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_CCNB, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_CCNB, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_CCNB, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_CCNB, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_CCNB, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_CCNB, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_CCNB, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(dense_ccnb == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_ccnb[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_ccnb[:, col]))
    print('')
# Label Powerset + Logistic Regression: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [LogisticRegression(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l1', 'l2'],          # l1 = lasso, l2 = ridge
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__max_iter': [10, 50, 100, 150]
    },
]
clf = GridSearchCV(LabelPowerset(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
clf.fit(x_train, y_train)  # train
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
print('Tuned HyperParameter; C: {}, Maximum Iteration: {}, Penalty: {} '.format(
    clf.best_params_['classifier__C'],
    clf.best_params_['classifier__max_iter'],
    clf.best_params_['classifier__penalty']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
y_LPLR = clf.best_estimator_.predict(x_test)  # predict
dense_lplr = y_LPLR.toarray()
# Evaluation measures for the best model found by the search.
print('The best model from grid-search for LP Logistic Regression has:')
for label, score in [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_LPLR)),  # fraction of wrong labels over all labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_LPLR)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_LPLR, normalize=False)),  # rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_lplr)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_lplr)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_lplr)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_lplr)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_lplr)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_LPLR, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_LPLR, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_LPLR, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_LPLR, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_LPLR, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_LPLR, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_LPLR, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_LPLR, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_LPLR, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_LPLR, average='macro')),
]:
    print(label, round(score, 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(dense_lplr == y_test)))
# Per-genre confusion matrix and classification report.
for col, genre in enumerate(y_test.columns.values):
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_lplr[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_lplr[:, col]))
    print('')
# Label Powerset + Linear SVC: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [LinearSVC(random_state=42, class_weight='balanced')],
        'classifier__penalty': ['l2'],
        'classifier__C': [1, 10, 100],
        'classifier__dual': [False, True],
        'classifier__max_iter': [10, 50, 100, 150],
    },
]
# FIX: this section trains and reports the Label Powerset (LP) Linear SVC, but the
# search was mistakenly wrapping ClassifierChain; use LabelPowerset so the model
# actually matches the 'LP Linear SVC' results printed below.
clf = GridSearchCV(LabelPowerset(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
#train
clf.fit(x_train, y_train)
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
print('Tuned HyperParameters; C: {}, Maximum Iteration: {}, Dual: {}'.format(
    clf.best_params_['classifier__C'],
    clf.best_params_['classifier__max_iter'],
    clf.best_params_['classifier__dual']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
#predict
y_LPLSVC = clf.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for LP Linear SVC has:\n')
print('Hamming Loss:', round(metrics.hamming_loss(y_test, y_LPLSVC), 3))  # fraction of wrong labels to total labels
print('Exact-Match Ratio:', round(metrics.accuracy_score(y_test, y_LPLSVC), 3))
print('Zero One Loss out of 1983:', round(metrics.zero_one_loss(y_test, y_LPLSVC, normalize=False), 3))  # rows not exactly correct
print('Log-Loss:', round(metrics.log_loss(y_test, y_LPLSVC.toarray()), 3))
print('Coverage-Error:', round(metrics.coverage_error(y_test, y_LPLSVC.toarray()), 3))
print('Ranking Loss:', round(metrics.label_ranking_loss(y_test, y_LPLSVC.toarray()), 3))
print('Label Ranking Average Precision:', round(metrics.label_ranking_average_precision_score(y_test, y_LPLSVC.toarray()), 3))
print('ROC AUC score:', round(metrics.roc_auc_score(y_test, y_LPLSVC.toarray()), 3))
print('Beta-F1-score:', round(metrics.fbeta_score(y_test, y_LPLSVC, average='weighted', beta=100), 3))  # beta=100 emphasizes recall
print('Micro-Recall:', round(metrics.recall_score(y_test, y_LPLSVC, average='micro'), 3))
print('Micro-Precision:', round(metrics.precision_score(y_test, y_LPLSVC, average='micro'), 3))
print('Micro-F1-score:', round(metrics.f1_score(y_test, y_LPLSVC, average='micro'), 3))
print('Weighted-Recall:', round(metrics.recall_score(y_test, y_LPLSVC, average='weighted'), 3))
print('Weighted-Precision:', round(metrics.precision_score(y_test, y_LPLSVC, average='weighted'), 3))
print('Weighted-F1-score:', round(metrics.f1_score(y_test, y_LPLSVC, average='weighted'), 3))
print('Macro-F1-score:', round(metrics.f1_score(y_test, y_LPLSVC, average='macro'), 3))
print('Macro-Recall:', round(metrics.recall_score(y_test, y_LPLSVC, average='macro'), 3))
print('Macro-Precision:', round(metrics.precision_score(y_test, y_LPLSVC, average='macro'), 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(y_LPLSVC.toarray() == y_test)))
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:, i], y_LPLSVC.toarray()[:, i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:, i], y_LPLSVC.toarray()[:, i]))
    print('')
# Label Powerset + Random Forest: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [RandomForestClassifier(random_state=42, class_weight='balanced')],
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__n_estimators': [10, 50, 100, 150],
        'classifier__max_depth': [5, 3, None],
    },
]
# FIX: this section trains and reports the Label Powerset (LP) Random Forest, but the
# search was mistakenly wrapping BinaryRelevance; use LabelPowerset so the model
# actually matches the 'LP Random Forest' results printed below.
clf = GridSearchCV(LabelPowerset(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted', n_jobs=-1)
start = time.time()
#train
clf.fit(x_train, y_train)
end = time.time()
print("Best F1-Score: ", round(clf.best_score_, 3))
# FIX: corrected the 'Max Dept' typo to 'Max Depth' (consistent with the other sections).
print('Tuned HyperParameters; Criterion: {}, N Estimators: {}, Max Depth: {}'.format(
    clf.best_params_['classifier__criterion'],
    clf.best_params_['classifier__n_estimators'],
    clf.best_params_['classifier__max_depth']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
#predict
y_LPRF = clf.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for LP Random Forest has:')
print('Hamming Loss:', round(metrics.hamming_loss(y_test, y_LPRF), 3))  # fraction of wrong labels to total labels
print('Exact-Match Ratio:', round(metrics.accuracy_score(y_test, y_LPRF), 3))
print('Zero One Loss out of 1983:', round(metrics.zero_one_loss(y_test, y_LPRF, normalize=False), 3))  # rows not exactly correct
print('Log-Loss:', round(metrics.log_loss(y_test, y_LPRF.toarray()), 3))
print('Coverage-Error:', round(metrics.coverage_error(y_test, y_LPRF.toarray()), 3))
print('Ranking Loss:', round(metrics.label_ranking_loss(y_test, y_LPRF.toarray()), 3))
print('Label Ranking Average Precision:', round(metrics.label_ranking_average_precision_score(y_test, y_LPRF.toarray()), 3))
print('ROC AUC score:', round(metrics.roc_auc_score(y_test, y_LPRF.toarray()), 3))
print('Beta-F1-score:', round(metrics.fbeta_score(y_test, y_LPRF, average='weighted', beta=100), 3))  # beta=100 emphasizes recall
print('Micro-Recall:', round(metrics.recall_score(y_test, y_LPRF, average='micro'), 3))
print('Micro-Precision:', round(metrics.precision_score(y_test, y_LPRF, average='micro'), 3))
print('Micro-F1-score:', round(metrics.f1_score(y_test, y_LPRF, average='micro'), 3))
print('Weighted-Recall:', round(metrics.recall_score(y_test, y_LPRF, average='weighted'), 3))
print('Weighted-Precision:', round(metrics.precision_score(y_test, y_LPRF, average='weighted'), 3))
print('Weighted-F1-score:', round(metrics.f1_score(y_test, y_LPRF, average='weighted'), 3))
print('Macro-F1-score:', round(metrics.f1_score(y_test, y_LPRF, average='macro'), 3))
print('Macro-Recall:', round(metrics.recall_score(y_test, y_LPRF, average='macro'), 3))
print('Macro-Precision:', round(metrics.precision_score(y_test, y_LPRF, average='macro'), 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(y_LPRF.toarray() == y_test)))
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:, i], y_LPRF.toarray()[:, i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:, i], y_LPRF.toarray()[:, i]))
    print('')
# Label Powerset + Gradient Boosting: grid-search the base estimator, then evaluate.
parameters = [
    {
        'classifier': [GradientBoostingClassifier(random_state=42)],
        'classifier__n_estimators': [50, 70, 100],      # more trees: better fit, more complexity
        'classifier__max_depth': [3, 5],                # tree depth
        'classifier__learning_rate': [0.01, 0.1, 0.5],  # high learning rates tend to overfit
    },
]
# FIX: this section trains and reports the Label Powerset (LP) Gradient Boosting model,
# but the search was mistakenly wrapping BinaryRelevance; use LabelPowerset so the model
# actually matches the 'LP Gradient Boosting' results printed below.
clf_LPGB = GridSearchCV(LabelPowerset(require_dense=[False, True]), parameters, cv=5, scoring='f1_weighted')
start = time.time()
#train
clf_LPGB.fit(x_train, y_train)
end = time.time()
# FIX: report this search's own score (it was printing clf_BRGB.best_score_, a
# copy-paste error from the BR Gradient Boosting section).
print("Best F1-Score: ", round(clf_LPGB.best_score_, 3))
print('Tuned HyperParameters; N Estimators: {}, Tree Depth: {}, Learning Rate: {},'.format(
    clf_LPGB.best_params_['classifier__n_estimators'],
    clf_LPGB.best_params_['classifier__max_depth'],
    clf_LPGB.best_params_['classifier__learning_rate']))
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))
#predict
y_LPGB = clf_LPGB.best_estimator_.predict(x_test)
#evaluation measures
print('The best model from grid-search for LP Gradient Boosting has:')
print('Hamming Loss:', round(metrics.hamming_loss(y_test, y_LPGB), 3))  # fraction of wrong labels to total labels
print('Exact-Match Ratio:', round(metrics.accuracy_score(y_test, y_LPGB), 3))
print('Zero One Loss out of 1983:', round(metrics.zero_one_loss(y_test, y_LPGB, normalize=False), 3))  # rows not exactly correct
print('Log-Loss:', round(metrics.log_loss(y_test, y_LPGB.toarray()), 3))
print('Coverage-Error:', round(metrics.coverage_error(y_test, y_LPGB.toarray()), 3))
print('Ranking Loss:', round(metrics.label_ranking_loss(y_test, y_LPGB.toarray()), 3))
print('Label Ranking Average Precision:', round(metrics.label_ranking_average_precision_score(y_test, y_LPGB.toarray()), 3))
print('ROC AUC score:', round(metrics.roc_auc_score(y_test, y_LPGB.toarray()), 3))
print('Beta-F1-score:', round(metrics.fbeta_score(y_test, y_LPGB, average='weighted', beta=100), 3))  # beta=100 emphasizes recall
print('Micro-Recall:', round(metrics.recall_score(y_test, y_LPGB, average='micro'), 3))
print('Micro-Precision:', round(metrics.precision_score(y_test, y_LPGB, average='micro'), 3))
print('Micro-F1-score:', round(metrics.f1_score(y_test, y_LPGB, average='micro'), 3))
print('Weighted-Recall:', round(metrics.recall_score(y_test, y_LPGB, average='weighted'), 3))
print('Weighted-Precision:', round(metrics.precision_score(y_test, y_LPGB, average='weighted'), 3))
print('Weighted-F1-score:', round(metrics.f1_score(y_test, y_LPGB, average='weighted'), 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(y_LPGB.toarray() == y_test)))
# Per-genre confusion matrix and classification report.
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:, i], y_LPGB.toarray()[:, i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:, i], y_LPGB.toarray()[:, i]))
    print('')
# Label Powerset transformation with a Gaussian Naive Bayes base learner
# (there is no hyper-parameter worth tuning, so no grid search here).
clf_LPNB = LabelPowerset(GaussianNB())
t0 = time.time()
clf_LPNB.fit(x_train, y_train)  # train
elapsed = time.time() - t0
hh, rest = divmod(elapsed, 3600)
mm, ss = divmod(rest, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hh), int(mm), int(ss)))
# predict on the held-out set; keep one dense copy for the metrics that need it
y_LPNB = clf_LPNB.predict(x_test)
dense_LPNB = y_LPNB.toarray()
# evaluation measures, printed in the same order as for the other models
print('LP Gaussian Naive Bayes has:')
scores = [
    ('Hamming Loss:', metrics.hamming_loss(y_test, y_LPNB)),  # fraction of wrong labels
    ('Exact-Match Ratio:', metrics.accuracy_score(y_test, y_LPNB)),
    ('Zero One Loss out of 1983:', metrics.zero_one_loss(y_test, y_LPNB, normalize=False)),  # count of rows not exactly correct
    ('Log-Loss:', metrics.log_loss(y_test, dense_LPNB)),
    ('Coverage-Error:', metrics.coverage_error(y_test, dense_LPNB)),
    ('Ranking Loss:', metrics.label_ranking_loss(y_test, dense_LPNB)),
    ('Label Ranking Average Precision:', metrics.label_ranking_average_precision_score(y_test, dense_LPNB)),
    ('ROC AUC score:', metrics.roc_auc_score(y_test, dense_LPNB)),
    ('Beta-F1-score:', metrics.fbeta_score(y_test, y_LPNB, average='weighted', beta=100)),
    ('Micro-Recall:', metrics.recall_score(y_test, y_LPNB, average='micro')),
    ('Micro-Precision:', metrics.precision_score(y_test, y_LPNB, average='micro')),
    ('Micro-F1-score:', metrics.f1_score(y_test, y_LPNB, average='micro')),
    ('Weighted-Recall:', metrics.recall_score(y_test, y_LPNB, average='weighted')),
    ('Weighted-Precision:', metrics.precision_score(y_test, y_LPNB, average='weighted')),
    ('Weighted-F1-score:', metrics.f1_score(y_test, y_LPNB, average='weighted')),
    ('Macro-F1-score:', metrics.f1_score(y_test, y_LPNB, average='macro')),
    ('Macro-Recall:', metrics.recall_score(y_test, y_LPNB, average='macro')),
    ('Macro-Precision:', metrics.precision_score(y_test, y_LPNB, average='macro')),
]
for label, value in scores:
    print(label, round(value, 3))
print('Test accuracy is per genre:\n{}'.format(np.mean(dense_LPNB == y_test)))
# per-genre confusion matrix and classification report
for col in range(y_test.shape[1]):
    genre = y_test.columns.values[col]
    print("Confusion Matrix:{}".format(genre))
    print(metrics.confusion_matrix(y_test.values[:, col], dense_LPNB[:, col]))
    print("\nClassification report of {}:".format(genre))
    print(classification_report(y_test.values[:, col], dense_LPNB[:, col]))
    print('')
# MLkNN (multi-label k-nearest neighbours). scikit-multilearn expects sparse
# label matrices, so convert the pandas targets first.
y_tr_sparse=scipy.sparse.csc_matrix(y_train.values)
y_te_sparse=scipy.sparse.csc_matrix(y_test.values)
# grid: k = number of neighbours, s = Bayesian smoothing parameter
parameters = {'k': range(1,12), 's': [0.5, 0.7, 1.0]}
score='f1_weighted'
clf = GridSearchCV(MLkNN(), parameters, cv=5, scoring=score)
#clf = GridSearchCV(BRkNNaClassifier(), parameters, scoring=score)
start= time.time()
#train (grid search over all k/s combinations, 5-fold CV)
clf.fit(x_train, y_tr_sparse)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameter; k: {}, s: {} '.format(clf.best_params_['k'],
    clf.best_params_['s']))
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the estimator refit on the best hyper-parameters
y_MLKNN= clf.best_estimator_.predict(x_test)
#evaluation measures (same battery as the other models)
print('The best model from grid-search for MLkNN has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_MLKNN),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_MLKNN),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_MLKNN, normalize=False),3))# # of instances classified not exactly correct
print('Log-Loss:',round(metrics.log_loss(y_test,y_MLKNN.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_MLKNN.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_MLKNN.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_MLKNN.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_MLKNN.toarray()),3))
# beta=100 weights recall far above precision in the F-beta score
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_MLKNN, average='weighted', beta=100),3))
print('Micro-Recall:',round(metrics.recall_score(y_test,y_MLKNN, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_MLKNN, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_MLKNN, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_MLKNN, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_MLKNN, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_MLKNN, average='weighted'),3))
# element-wise agreement with the ground-truth label matrix
print('Test accuracy is per genre:\n{}'.format(np.mean(y_MLKNN.toarray() == y_test)))
# one binary confusion matrix + classification report per genre column
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i], y_MLKNN.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i], y_MLKNN.toarray()[:,i]))
    print('')
# BRkNNa (binary-relevance k-nearest neighbours); only k is tuned.
parameters = {'k': range(1,12)}
score='f1_weighted'
clf = GridSearchCV(BRkNNaClassifier(), parameters, cv=5, scoring=score)
start= time.time()
#train (reuses y_tr_sparse built for MLkNN above)
clf.fit(x_train, y_tr_sparse)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameter; k: {}'.format(clf.best_params_['k']))
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the estimator refit on the best k
y_BRKNN= clf.best_estimator_.predict(x_test)
#evaluation measures (same battery as the other models)
print('The best model from grid-search for BRkNN has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_BRKNN),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_BRKNN),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_BRKNN, normalize=False),3))# # of instances classified not exactly correct
print('Log-Loss:',round(metrics.log_loss(y_test,y_BRKNN.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_BRKNN.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_BRKNN.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_BRKNN.toarray()),3))
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_BRKNN.toarray()),3))
# beta=100 weights recall far above precision in the F-beta score
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_BRKNN, average='weighted', beta=100),3))
print('Micro-Recall:',round(metrics.recall_score(y_test,y_BRKNN, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_BRKNN, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_BRKNN, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_BRKNN, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_BRKNN, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_BRKNN, average='weighted'),3))
# element-wise agreement with the ground-truth label matrix
print('Test accuracy is per genre:\n{}'.format(np.mean(y_BRKNN.toarray() == y_test)))
# one binary confusion matrix + classification report per genre column
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i], y_BRKNN.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i], y_BRKNN.toarray()[:,i]))
    print('')
# RakelD: random disjoint label-space partitioning with Gaussian NB per subset;
# only the labelset size is tuned.
parameters = [
    {
        'labelset_size': [3, 5, 10, 50]
    },
]
clf = GridSearchCV(RakelD(base_classifier=GaussianNB(),base_classifier_require_dense=[True, True]), parameters, cv=5, scoring='f1_weighted')
start= time.time()
#train
clf.fit(x_train, y_train)
end=time.time()
print("Best F1-Score: ",round(clf.best_score_,3))
print('Tuned HyperParameter; Labelset Size: {}'.format(clf.best_params_['labelset_size']))
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("Execution time: {:0>2}:{:0>2}:{:0>2}".format(int(hours),int(minutes),int(seconds)))
#predict with the estimator refit on the best labelset size
y_RAKEL= clf.best_estimator_.predict(x_test)
#evaluation measures (same battery as the other models)
print('The best model from grid-search for Rakeld has:')
print('Hamming Loss:',round(metrics.hamming_loss(y_test,y_RAKEL),3)) #the fraction of the wrong labels to the total number of labels
print('Exact-Match Ratio:',round(metrics.accuracy_score(y_test,y_RAKEL),3))
print('Zero One Loss out of 1983:',round(metrics.zero_one_loss(y_test,y_RAKEL, normalize=False),3))# # of instances classified not exactly correct
print('Log-Loss:',round(metrics.log_loss(y_test,y_RAKEL.toarray()),3))
print('Coverage-Error:',round(metrics.coverage_error(y_test,y_RAKEL.toarray()),3))
print('Ranking Loss:',round(metrics.label_ranking_loss(y_test,y_RAKEL.toarray()),3))
print('Label Ranking Average Precision:',round(metrics.label_ranking_average_precision_score(y_test,y_RAKEL.toarray()),3))
# NOTE(review): this model's ROC AUC uses average='micro' while the other
# models use the default (macro) -- the scores are not directly comparable.
print('ROC AUC score:',round(metrics.roc_auc_score(y_test,y_RAKEL.toarray(), average='micro'),3))
# beta=100 weights recall far above precision in the F-beta score
print('Beta-F1-score:',round(metrics.fbeta_score(y_test,y_RAKEL, average='weighted', beta=100),3))
print('Micro-Recall:',round(metrics.recall_score(y_test,y_RAKEL, average='micro'),3))
print('Micro-Precision:',round(metrics.precision_score(y_test,y_RAKEL, average='micro'),3))
print('Micro-F1-score:',round(metrics.f1_score(y_test,y_RAKEL, average='micro'),3))
print('Weighted-Recall:',round(metrics.recall_score(y_test,y_RAKEL, average='weighted'),3))
print('Weighted-Precision:',round(metrics.precision_score(y_test,y_RAKEL, average='weighted'),3))
print('Weighted-F1-score:',round(metrics.f1_score(y_test,y_RAKEL, average='weighted'),3))
# element-wise agreement with the ground-truth label matrix
print('Test accuracy is per genre:\n{}'.format(np.mean(y_RAKEL.toarray() == y_test)))
# one binary confusion matrix + classification report per genre column
for i in range(y_test.shape[1]):
    print("Confusion Matrix:{}".format(y_test.columns.values[i]))
    print(metrics.confusion_matrix(y_test.values[:,i],y_RAKEL.toarray()[:,i]))
    print("\nClassification report of {}:".format(y_test.columns.values[i]))
    print(classification_report(y_test.values[:,i], y_RAKEL.toarray()[:,i]))
    print('')
def imageGenre(s, filename=None, thickness=85):
    """Render the genre name *s* as black text on a white banner and save it.

    The original notebook redefined this function twelve times, differing only
    in the hard-coded output filename (and stroke thickness 75 vs 85); only the
    last definition survived. The filename and thickness are now parameters.

    Parameters
    ----------
    s : str
        Text to draw (the genre name).
    filename : str, optional
        Path the image is written to; defaults to ``s.lower() + '.jpg'``.
    thickness : int, optional
        Stroke thickness passed to ``cv2.putText`` (default 85).

    Returns
    -------
    numpy.ndarray
        The rendered image (white background, black text).
    """
    if filename is None:
        filename = s.lower() + '.jpg'
    # width scales with the character count so longer names still fit
    img = np.zeros((612, len(s) * 420, 3), np.uint8)
    font = cv2.FONT_HERSHEY_TRIPLEX
    cv2.putText(img, s, (0, 500), font, 18, (255, 255, 255), thickness)
    img = 255 - img  # invert: white background, black text
    cv2.imwrite(filename, img)
    return img

# Generate the banner images. Only the uncommented calls executed in the
# original; filenames (including their capitalisation) are kept identical.
#plt.imshow(imageGenre('ALTERNATIVE', 'alternative.jpg'))
#plt.imshow(imageGenre('COUNTRY', 'country.jpg'))
#plt.imshow(imageGenre('ELECTRONIC', 'electronic.jpg'))
plt.imshow(imageGenre('FOLK', 'folk.jpg', thickness=75))
#plt.imshow(imageGenre('HIP-HOP', 'hiphop.jpg'))
#plt.imshow(imageGenre('HOUSE', 'house.jpg'))
plt.imshow(imageGenre('INDIE', 'Indie.jpg'))
plt.imshow(imageGenre('JAZZ', 'jazz.jpg'))
plt.imshow(imageGenre('POP', 'POP.jpg'))
plt.imshow(imageGenre('RAP', 'RAP.jpg'))
#plt.imshow(imageGenre('RNB', 'RNB.jpg', thickness=75))
# One word cloud per genre, masked by the genre's banner image.
# Refactored from twelve copy-pasted stanzas into a single loop; each pair is
# (mask image path, dummy genre column), with the paths' original spellings kept.
genre_masks = [
    ("./ALTERNATIVE.jpg", "alternative"),
    ("./COUNTRY.jpg", "country"),
    ("./ELECTRONIC.jpg", "electronic"),
    ("./FOLK.jpg", "folk"),
    ("./HIPHOP.jpg", "hiphop"),
    ("./house.jpg", "house"),
    ("./indie.jpg", "indie"),
    ("./Jazz.jpg", "jazz"),
    ("./metal.jpg", "metal"),
    ("./pop.jpg", "pop"),
    ("./rap.jpg", "rap"),
    ("./rnb.jpg", "rnb"),
]
# NOTE(review): several of these paths do not match the capitalisation of the
# files written by imageGenre above (e.g. 'Indie.jpg' was written but
# './indie.jpg' is read) -- on a case-sensitive filesystem these loads fail;
# verify the image assets on disk.
for mask_path, col in genre_masks:
    mask = np.array(Image.open(mask_path))[:, :, 1]  # one channel as the mask
    text = df[df[col] == 1]['Lyrics1'].values
    wc = WordCloud(background_color="white", max_words=200, mask=mask)
    wc.generate(" ".join(text))
    plt.figure(figsize=(20, 5))
    plt.axis("off")
    plt.imshow(wc, alpha=0.98)
    plt.show()
#most frequent words per genre
corpus=df['Lyrics1']
def get_top_n_words(corpu, n=None):
    """Return the n most frequent (word, count) pairs in the corpus *corpu*,
    with English stop-words excluded from the count."""
    vectorizer = CountVectorizer(stop_words='english').fit(corpu)
    # column-wise totals give the corpus-wide frequency of every vocabulary term
    totals = vectorizer.transform(corpu).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
# Bar chart of the 10 most frequent words for each genre dummy column (3..14).
for i in range(3, 15):
    genre_lyrics = df[df.iloc[:, i] == 1]['Lyrics1']
    common_words = get_top_n_words(genre_lyrics, 10)
    data = pd.DataFrame(common_words, columns=['Common_Words', 'count'])
    sns.set(font_scale=2)
    plt.figure(figsize=(10, 5))
    # FIX: pass x/y as keywords -- positional data arguments to sns.barplot
    # were deprecated in seaborn 0.12 and later removed.
    ax = sns.barplot(x=data['Common_Words'], y=data['count'], palette='viridis')
    plt.title("Most frequent words of {}".format(df.columns.values[i]))
    plt.xlabel('')
    # annotate every bar with its raw count, just above the bar top
    rects = ax.patches
    labels = data['count']
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label,
                ha='center', va='bottom', fontsize=18)
    plt.setp(ax.get_xticklabels(), rotation=0)
    ax.set(yticklabels=[])  # counts are printed on the bars, hide the axis
    plt.show()
# Fit one tf-idf vectorizer on the full corpus, then report the strongest
# terms for each genre. Refactored from twelve copy-pasted stanzas.
vec = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2', stop_words='english')
corpus = df['Lyrics1']
tfs = vec.fit_transform(corpus)
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2 in
# favour of get_feature_names_out(); kept as-is to match this file's version.
feature_array = np.array(vec.get_feature_names())

def _top_tfidf_terms(lyrics, n=10):
    """Return the n terms with the highest mean tf-idf weight over *lyrics*.

    BUGFIX: the original computed
        np.argsort(response.toarray()).flatten()[::-1]
    which argsorts each row of the (n_docs, n_features) matrix separately and
    then flattens, so the leading indices come from arbitrary rows and are not
    the genre-wide top terms. Ranking by the mean tf-idf across the genre's
    documents gives the intended result.
    """
    response = vec.transform(lyrics)
    mean_tfidf = np.asarray(response.mean(axis=0)).ravel()
    order = np.argsort(mean_tfidf)[::-1]  # descending weight
    return feature_array[order][:n]

for col, label in [('alternative', 'Alternative'), ('country', 'Country'),
                   ('electronic', 'Electronic'), ('folk', 'Folk'),
                   ('hiphop', 'Hip-Hop'), ('house', 'House'),
                   ('indie', 'Indie'), ('jazz', 'Jazz'),
                   ('metal', 'Metal'), ('pop', 'Pop'),
                   ('rap', 'Rap'), ('rnb', 'RNB')]:
    genre = df[df[col] == 1]['Lyrics1']
    top_n = _top_tfidf_terms(genre, 10)
    print("Top 10 tf-idf of {}: \n{}".format(label, top_n.tolist()))
#max f20 feature per genre
# For each genre column (3..14), fit a fresh tf-idf vectorizer whose
# vocabulary is capped at 20 terms and print which terms survive the cap.
for col_idx in range(3, 15):
    genre_lyrics = df[df.iloc[:, col_idx] == 1]['Lyrics1']
    capped_vec = TfidfVectorizer(max_features=20, strip_accents='unicode',
                                 analyzer='word', ngram_range=(1, 1), norm='l2')
    capped_vec.fit_transform(genre_lyrics)
    top_terms = np.array(capped_vec.get_feature_names())
    print("\nTop features of {}:".format(df.columns.values[col_idx]))
    print(top_terms)
# Maximum tf-idf value per token throughout the whole dataset
# Fit tf-idf over the whole corpus, take each feature's maximum weight across
# all lyrics, and report the three weakest and three strongest terms.
text = df['Lyrics1']
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                             ngram_range=(1, 1), norm='l2')
new1 = vectorizer.fit_transform(text)
# column-wise maximum -> one weight per vocabulary term
max_val = new1.max(axis=0).toarray().ravel()
sort_by_tfidf = np.argsort(max_val)  # ascending: weakest terms first
feature_names = np.array(vectorizer.get_feature_names())
print("Features with lowest tfidf:\n{}".format(
    feature_names[sort_by_tfidf[:3]]))
print("\nFeatures with highest tfidf: \n{}".format(
    feature_names[sort_by_tfidf[-3:]]))
#overall wordcloud
# Word cloud over every lyric in the corpus, shaped by a music-notes mask,
# recoloured with the magma colormap and saved to disk.
mask = np.array(Image.open("./music_notes.png"))
text = df['Lyrics1'].values
wc = WordCloud(background_color="white", max_words=5000, mask=mask)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 50))
plt.axis("off")
plt.imshow(wc.recolor(colormap='magma', random_state=244), alpha=0.98)
plt.savefig('treble_clef.png', bbox_inches='tight')
# 6x2 grid of genre word clouds, one subplot per genre, all recoloured with
# the same 'Reds' colormap. Refactored from twelve copy-pasted stanzas into a
# loop; the subplot order and mask-path spellings match the original exactly.
genre_panels = [
    ("./POP.jpg", "pop", "POP"),
    ("./RNB.jpg", "rnb", "RNB"),
    ("./alternative.jpg", "alternative", "ALTERNATIVE"),
    ("./indie.jpg", "indie", "INDIE"),
    ("./hiphop.jpg", "hiphop", "HIPHOP"),
    ("./RAP.jpg", "rap", "RAP"),
    ("./house.jpg", "house", "HOUSE"),
    ("./electronic.jpg", "electronic", "ELECTRONIC"),
    ("./country.jpg", "country", "COUNTRY"),
    ("./metal.jpg", "metal", "METAL"),
    ("./folk.jpg", "folk", "FOLK"),
    ("./JAZZ.jpg", "jazz", "JAZZ"),
]
plt.figure(figsize=(50, 50))
for panel, (mask_path, col, label) in enumerate(genre_panels, start=1):
    plt.subplot(6, 2, panel)
    mask = np.array(Image.open(mask_path))[:, :, 1]  # one channel as the mask
    lyrics = df[df[col] == 1]['Lyrics1'].values
    wc = WordCloud(background_color="black", max_words=200, mask=mask)
    wc.generate(" ".join(lyrics))
    plt.axis("off")
    plt.title("Words frequented in {} Music".format(label), fontsize=20)
    plt.imshow(wc.recolor(colormap='Reds', random_state=244), alpha=0.98)
plt.show()